Fuente de datos

Recursos de información

Datos

# Lectura de datos
datos <- read.csv(file = "https://www.datos.gov.co/api/views/25nw-b8kq/rows.csv?accessType=DOWNLOAD")

# Depuración de datos
library(tidyverse)
library(Hmisc)
datos %>% 
  select(depto = Departamentos,
         total_pred = Total.predios,
         total_bov = Total.Bovinos...2017,
         area_agri = Area.total.sembrada.del.sector.agricola,
         area_fores = Agroforestal..Ha.,
         area_cons = ConservaciOn.de.Suelos...Ha.,
         area_gan = Ganadera..Ha.,
         em_agri = Emisiones.Sector.AgrIcola..Miles.de.ton.CO2.eq.,
         em_fores = Emisiones.Sector.Forestal..Miles.de.ton.CO2.eq.,
         em_pec = Sector.Pecuario..Miles.de.ton.CO2.eq.) %>% 
  mutate(depto = capitalize(tolower(depto)),
         depto = gsub("Boyaca", "Boyacá", depto, ignore.case = TRUE),
         depto = gsub("Distrito capital", "Cundinamarca", depto,
                      ignore.case = TRUE),
         sum_em = em_agri + em_fores + em_pec) -> 
  datos2

datos2

Análisis exploratorio

Densidades originales

datos2 %>% 
  select_if(is.numeric) %>% 
  gather() %>% 
  ggplot(data = ., aes(x = value)) +
  facet_wrap(~key, scales = "free") +
  geom_density() +
  ggtitle("Escala original")

Distribuciones (logaritmos)

datos2 %>% 
  select_if(is.numeric) %>% 
  gather() %>% 
  ggplot(data = ., aes(x = value)) +
  facet_wrap(~key, scales = "free") +
  geom_density() +
  scale_x_log10() +
  ggtitle("Escala logarítmica")

Correlaciones

library(corrplot)
library(RColorBrewer)
datos2 %>% 
  select_if(is.numeric) %>% 
  select(-c(em_agri:em_pec)) %>% 
  mutate_if(is.numeric, log) %>% 
  filter_all(all_vars(.>0)) %>% 
  cor(use = "pairwise.complete.obs") %>% 
  corrplot(type = "upper", diag = FALSE, tl.srt = 45, tl.col = "black", 
           method = "pie", addgrid.col = "black", 
           col = brewer.pal(n = 8, name = "RdBu"))

Distribución por departamento

library(jcolors)

datos2 %>% 
  select(which(sapply(datos2,class) %in% c("numeric", "integer")), depto) %>% 
  gather(key = "variable", value = "valor", -depto) %>% 
  mutate(valor = log10(valor)) %>% 
  ggplot(data = ., aes(x = depto, y = valor, fill = depto)) +
  geom_boxplot() +
  scale_fill_jcolors(palette = "pal3") +
  facet_wrap(facets = ~variable, scales = "free", ncol = 3) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        legend.position = "none")

Dispersiones

Graficación base (graphics)

plot(datos2$total_bov, datos2$sum_em, log = "xy")
abline(lm(log10(datos2$sum_em) ~ log10(datos2$total_bov)), 
       col = "red")

Graficación con ggplot2

datos2 %>% 
  ggplot(., aes(x = total_bov, y = sum_em)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE, color = "red")

datos2 %>% 
  ggplot(., aes(x = total_bov, y = sum_em, color = depto)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_jcolors(palette = "pal3") +
  theme(legend.position = "bottom") +
  labs(color = "")

Todas vs sumatoria de emisiones

datos2 %>% 
  select(-c(em_agri:em_pec)) %>% 
  gather(key = "variable", value = "valor", -c(sum_em, depto)) %>% 
  ggplot(data = ., aes(x = valor, y = sum_em, color = depto)) +
  facet_wrap(~ variable, scales = "free") +
  geom_point(alpha = 0.3) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_jcolors(palette = "pal3") +
  theme(legend.position = "bottom") +
  labs(color = "")

Análisis de componentes principales

Resumen ACP

library(FactoMineR)
datos2 %>% 
  select(-c(em_agri:em_pec, depto)) %>% 
  mutate(area_fores = log10(area_fores+1),
         area_cons = log10(area_cons+1),
         area_gan = log10(area_gan+1),
         total_pred = log10(total_pred),
         total_bov = log10(total_bov),
         area_agri = log10(area_agri),
         sum_em = log10(sum_em))-> datos_acp
acp <- PCA(X = datos_acp, scale.unit = TRUE, graph = FALSE)
summary(acp)
## 
## Call:
## PCA(X = datos_acp, scale.unit = TRUE, graph = FALSE) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6   Dim.7
## Variance               3.098   1.132   0.948   0.731   0.573   0.325   0.195
## % of var.             44.255  16.173  13.539  10.440   8.179   4.636   2.779
## Cumulative % of var.  44.255  60.428  73.967  84.406  92.586  97.221 100.000
## 
## Individuals (the 10 first)
##                Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
## 1          |  3.175 | -2.648  0.719  0.696 | -0.022  0.000  0.000 |  0.458
## 2          |  3.254 |  2.562  0.673  0.620 | -0.375  0.039  0.013 |  0.722
## 3          |  1.456 |  0.610  0.038  0.175 |  0.540  0.082  0.137 |  0.828
## 4          |  1.537 |  0.457  0.021  0.088 |  0.076  0.002  0.002 |  1.431
## 5          |  2.577 | -2.325  0.554  0.814 |  0.172  0.008  0.004 | -0.038
## 6          |  3.687 | -2.613  0.700  0.502 | -1.376  0.531  0.139 |  1.728
## 7          |  1.457 | -0.177  0.003  0.015 |  0.137  0.005  0.009 |  1.206
## 8          |  2.166 | -0.925  0.088  0.182 | -0.410  0.047  0.036 | -0.514
## 9          |  1.989 | -1.402  0.201  0.497 |  0.787  0.174  0.157 | -0.967
## 10         |  1.652 | -0.617  0.039  0.139 |  0.902  0.228  0.298 | -0.420
##               ctr   cos2  
## 1           0.070  0.021 |
## 2           0.175  0.049 |
## 3           0.229  0.323 |
## 4           0.686  0.866 |
## 5           0.000  0.000 |
## 6           1.000  0.220 |
## 7           0.487  0.686 |
## 8           0.089  0.056 |
## 9           0.313  0.236 |
## 10          0.059  0.065 |
## 
## Variables
##               Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2
## total_pred |  0.657 13.954  0.432 |  0.259  5.928  0.067 |  0.540 30.768  0.292
## total_bov  |  0.904 26.387  0.817 |  0.050  0.217  0.002 | -0.027  0.079  0.001
## area_agri  |  0.715 16.519  0.512 |  0.015  0.019  0.000 | -0.338 12.046  0.114
## area_fores |  0.307  3.041  0.094 |  0.787 54.720  0.619 |  0.063  0.425  0.004
## area_cons  |  0.403  5.247  0.163 | -0.538 25.608  0.290 |  0.613 39.714  0.376
## area_gan   |  0.564 10.268  0.318 | -0.389 13.400  0.152 | -0.361 13.783  0.131
## sum_em     |  0.873 24.584  0.762 | -0.035  0.108  0.001 | -0.174  3.185  0.030
##             
## total_pred |
## total_bov  |
## area_agri  |
## area_fores |
## area_cons  |
## area_gan   |
## sum_em     |

Componente 1 vs Componente 2

  • Se deben ingresar a la base de datos original (datos2) las nuevas coordenadas obtenidas con ACP
datos2$cp1 <- acp$ind$coord[,1]
datos2$cp2 <- acp$ind$coord[,2]
datos2$cp3 <- acp$ind$coord[,3]
library(factoextra)
library(ggpubr)
ggarrange(
  datos2 %>% 
  ggplot(data = ., aes(x = cp1, y = cp2, color = depto)) +
  geom_point() +
  geom_vline(xintercept = 0, color = "red", lty = 2, size = 0.5) +
  geom_hline(yintercept = 0, color = "red", lty = 2, size = 0.5) +
  scale_color_jcolors(palette = "pal3") +
  theme(legend.position = "bottom") +
  labs(color = ""),
  
  fviz_pca_var(acp, axes = c(1,2)),
  
  ncol = 2
)

CP1 vs CP3

ggarrange(
  datos2 %>% 
  ggplot(data = ., aes(x = cp1, y = cp3, color = depto)) +
  geom_point() +
  geom_vline(xintercept = 0, color = "red", lty = 2, size = 0.5) +
  geom_hline(yintercept = 0, color = "red", lty = 2, size = 0.5) +
  scale_color_jcolors(palette = "pal3") +
  theme(legend.position = "bottom") +
  labs(color = ""),
  
  fviz_pca_var(acp, axes = c(1,3)),
  
  ncol = 2
)

CP2 vs CP3

ggarrange(
  datos2 %>% 
  ggplot(data = ., aes(x = cp2, y = cp3, color = depto)) +
  geom_point() +
  geom_vline(xintercept = 0, color = "red", lty = 2, size = 0.5) +
  geom_hline(yintercept = 0, color = "red", lty = 2, size = 0.5) +
  scale_color_jcolors(palette = "pal3") +
  theme(legend.position = "bottom") +
  labs(color = ""),
  
  fviz_pca_var(acp, axes = c(2,3)),
  
  ncol = 2
)

CP1, CP2 y CP3

library(plotly)

plot_ly(data = datos2, x = ~cp1, y = ~cp2, z = ~cp3, color = ~depto) %>% 
  add_markers()

Análisis de varianza

\[H_0: \mu_{Boyacá} = \mu_{Cundinamarca} = \mu_{Meta} = \mu_{Tolima} \\ H_1: \mu_i \neq \mu_j\]

Modelo originales

modelo_anova1 <- aov(sum_em ~ depto, data = datos2)
summary(object = modelo_anova1)
##              Df   Sum Sq Mean Sq F value Pr(>F)    
## depto         3  8946782 2982261   32.16 <2e-16 ***
## Residuals   309 28652051   92725                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2 observations deleted due to missingness

Modelo logaritmos

modelo_anova2 <- aov(sum_em ~ depto, data = datos2 %>% 
                      mutate(sum_em = log(sum_em)))
summary(object = modelo_anova2)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## depto         3    175   58.34   76.72 <2e-16 ***
## Residuals   309    235    0.76                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2 observations deleted due to missingness

Residuales originales

par(mfrow = c(2,2))
plot(modelo_anova1)

Residuales logarítmicos

par(mfrow = c(2,2))
plot(modelo_anova2)

Comparaciones de medias - Tukey

TukeyHSD(x = modelo_anova1, conf.level = 0.95)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = sum_em ~ depto, data = datos2)
## 
## $depto
##                           diff        lwr       upr     p adj
## Cundinamarca-Boyacá   26.25064  -75.97960  128.4809 0.9107847
## Meta-Boyacá          598.73355  436.23721  761.2299 0.0000000
## Tolima-Boyacá         47.73825  -87.29744  182.7739 0.7978420
## Meta-Cundinamarca    572.48291  409.03952  735.9263 0.0000000
## Tolima-Cundinamarca   21.48760 -114.68624  157.6614 0.9770957
## Tolima-Meta         -550.99530 -736.72994 -365.2607 0.0000000

Comparación de medias gráfico

library(broom)
tidy(TukeyHSD(x = modelo_anova1, conf.level = 0.95)) %>% 
  ggplot(data = ., aes(x = reorder(comparison, estimate), y = estimate,
                       ymin = conf.low, ymax = conf.high)) +
  geom_errorbar(width = 0.2) +
  geom_point() +
  geom_hline(yintercept = 0, lty =2, color = "red", size = 0.5) +
  labs(x = "") +
  coord_flip()